StyleTransfer

Loading Libraries

In [1]:
import os
import librosa
from IPython.display import Audio,display
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
/home/mahidharv/.local/lib/python2.7/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/lib64/python2.7/site-packages/cffi/model.py:531: UserWarning: 'point_conversion_form_t' has no values explicitly defined; guessing that it is equivalent to 'unsigned int'
  % self._get_c_name())
In [2]:
import tensorflow as tf
# NOTE(review): this session appears unused — the later cells each open
# their own tf.Session() inside their graph contexts; candidate for removal.
sess = tf.Session()
In [4]:
os.listdir('.')  # quick directory listing to locate the input WAV files
Out[4]:
['.ipynb_checkpoints',
 'Soundclassification.ipynb',
 'Soundclassification-testing.ipynb',
 'genres.tar.gz',
 'genres',
 'X_train_many.npy',
 'y_train_many.npy',
 'X_train.npy',
 'y_train.npy',
 'spec.png',
 'MidiBeethovenandMozart',
 'Text Generation With LSTM Recurrent Neural Networks in Python with Keras.ipynb',
 'X_train_many_features.npy',
 'Tere_Jaisa_Acoustic_489483_SongPK.co.in.mp3',
 'best_gs_pipeline.pkl',
 'BackGrounfFrmBasic.ipynb',
 'sultans_novoice1.wav',
 'Spec1.png',
 'midi.pyc',
 'Untitled Folder',
 'LSTM Seq-Seq Generation Music.ipynb',
 'log',
 'plot_vocal_separation.ipynb',
 '1_Introduction_to_Music_Unmixing.ipynb',
 'track.stem.mp4',
 'Sample1_background.wav',
 'Sample1_foreground.wav',
 'Sound_model_testing_files',
 'background.wav',
 'BackGroundExtraction.ipynb',
 's1.wav',
 'right.wav',
 'left.wav',
 'central.wav',
 'central_inv.wav',
 'background.midi',
 'background1.wav',
 'Function_to_decodebackgoi=omg.ipynb',
 'example.mid',
 'midi.py',
 'midi2audio',
 'python-musical',
 'Pipeline_of_Classifiers_GridSearch.ipynb',
 'y_train_many_features.npy',
 'Intro(Skit)_[songpk.co.in].mp3',
 'Dataset',
 'Play_music.ipynb',
 'blues.00000.wav',
 'Spectograms_to_Audio.ipynb',
 'out.wav',
 'Untitled1.ipynb',
 'StyleTransfer.ipynb',
 'Untitled.ipynb',
 'sample.wav',
 'X_temp_all_songs.npy',
 'y_temp.npy',
 'blues.wav',
 'GAN.ipynb',
 'Spegrams',
 'GAN for Style Transfer.ipynb',
 'Style_Transfer.ipynb',
 'op.wav']

Loading Data from Google Drive

In [5]:
# Paths to the two input clips: CONTENT supplies the musical structure,
# STYLE supplies the texture to be transferred onto it.
CONTENT_FILENAME = "./MidiBeethovenandMozart/beethovenminuetinG.wav"
STYLE_FILENAME = "./MidiBeethovenandMozart/Mozartminuetk2.wav" 
In [28]:
from  IPython.display import Audio,display
In [29]:
# Inline audio players so both clips can be auditioned in the notebook.
display(Audio(CONTENT_FILENAME))
display(Audio(STYLE_FILENAME))

Reading the Spectrogram

In [6]:
N_FFT = 2048
def read_audio_spectum(filename, max_frames=500):
    """Load an audio file and return its log-magnitude spectrogram.

    Parameters
    ----------
    filename : str
        Path to an audio file readable by ``librosa.load``.
    max_frames : int, optional
        Number of STFT time frames to keep (default 500), so that the
        content and style spectrograms share a common width.

    Returns
    -------
    S : np.ndarray
        Log-scaled (log1p) magnitude spectrogram,
        shape (1 + N_FFT // 2, <= max_frames).
    fs : int
        Sample rate reported by ``librosa.load``.
    """
    x, fs = librosa.load(filename)
    S = librosa.stft(x, N_FFT)
    print(S.shape)
    # Phase is deliberately discarded here; it is re-estimated later by
    # iterative (Griffin-Lim style) reconstruction.  The original cell
    # computed np.angle(S) into an unused local — removed as dead code.
    S = np.log1p(np.abs(S[:, :max_frames]))
    return S, fs
In [7]:
# Load both spectrograms; crop the style spectrogram to the content's
# dimensions so the downstream feature tensors line up.
a_content, fs = read_audio_spectum(CONTENT_FILENAME)
a_style, fs = read_audio_spectum(STYLE_FILENAME)

N_SAMPLES = a_content.shape[1]  # number of STFT time frames kept
N_CHANNELS = a_content.shape[0]  # number of frequency bins (1 + N_FFT/2)
print(N_SAMPLES)
print(N_CHANNELS)
a_style = a_style[:N_CHANNELS, :N_SAMPLES]  # crop style to content's shape
print(a_content[:5,:5])
print(a_style[:5,:5])
print(a_style.shape)
(1025, 6140)
(1025, 1982)
500
1025
[[0.02599781 0.02834302 0.02829463 0.01995481 0.0129744 ]
 [0.0135795  0.0132571  0.01490314 0.0104982  0.00656421]
 [0.004386   0.00469993 0.00798183 0.01997093 0.02285104]
 [0.01147008 0.00862399 0.01391978 0.05348829 0.08228461]
 [0.01413614 0.02444095 0.03759475 0.07816111 0.10768934]]
[[0.05067649 0.05086407 0.04316699 0.02760406 0.01684787]
 [0.02035191 0.03050695 0.02409084 0.01385229 0.01285057]
 [0.00417648 0.00584911 0.00561274 0.00306599 0.00585378]
 [0.00452424 0.00751203 0.00635189 0.01653495 0.01611581]
 [0.02149358 0.03751043 0.04254709 0.02305712 0.04587249]]
(1025, 500)
In [8]:
print(a_style[0:,1])  # spot-check: the full style spectrum at time frame 1
[5.0864067e-02 3.0506948e-02 5.8491086e-03 ... 9.5884684e-07 1.0388161e-06
 8.0653183e-07]
In [9]:
plt.style.use('classic')  # matplotlib's classic look for all later figures
In [11]:
from librosa import display

Plotting the Spectrogram

In [12]:
# Side-by-side log-frequency spectrograms of the two input clips.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.title('Content')
librosa.display.specshow(a_content, y_axis='log',x_axis='time')
plt.colorbar(format='%+2.0f dB')

plt.subplot(1, 2, 2)
plt.title('Style')
librosa.display.specshow(a_style, y_axis='log',x_axis='time')
plt.colorbar(format='%+2.0f dB')
Out[12]:
<matplotlib.colorbar.Colorbar at 0xd7f3ed0>

Adding the Filter Initialization

In [13]:
N_FILTERS = 4096  # width of the random convolution "layer" used as features

# Reshape each (freq, time) spectrogram to the NHWC-like layout
# (batch=1, height=1, width=time, channels=freq) expected by tf.nn.conv2d.
a_content_tf = np.ascontiguousarray(a_content.T[None,None,:,:])
a_style_tf = np.ascontiguousarray(a_style.T[None,None,:,:])
In [14]:
from sys import stderr
In [15]:
# Glorot-style scale for a 1x11 random convolution kernel: the "network"
# providing features is a single untrained conv layer.
std = np.sqrt(2) * np.sqrt(2.0 / ((N_CHANNELS + N_FILTERS) * 11))
kernel = np.random.randn(1, 11, N_CHANNELS, N_FILTERS)*std
# NOTE: the original cell also created a tf.placeholder and a tf.constant
# here, but both are rebuilt inside the graphs of the later cells; the
# copies here only added dead nodes to the default graph and are removed.

Creating the Computational Graph

In [18]:
g = tf.Graph()
with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
    # data shape is "[batch, in_height, in_width, in_channels]",
    # so a spectrogram is fed as (1, 1, time, freq).
    x = tf.placeholder('float32', [1,1,N_SAMPLES,N_CHANNELS], name="x")
    print(x)
    # Fixed random kernel: features come from an untrained conv layer.
    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")
    
    relu_conv = tf.nn.elu(conv)
    # pool_size=[1, 1] with stride 1 is effectively an identity layer.
    net = tf.layers.max_pooling2d(inputs=relu_conv, pool_size=[1, 1], strides=1)
    # Evaluate the graph once per input to capture the target feature maps.
    content_features = net.eval(feed_dict={x: a_content_tf})
    style_features = net.eval(feed_dict={x: a_style_tf})
    print(content_features.shape)
    print(style_features.shape)
    # Style is summarized by the Gram matrix of the flattened features.
    features = np.reshape(style_features, (-1, N_FILTERS))
    print(features.shape)
    style_gram = np.matmul(features.T, features) / N_SAMPLES
    print(style_gram.shape)
Tensor("x:0", shape=(1, 1, 500, 1025), dtype=float32, device=/device:CPU:0)
(1, 1, 490, 4096)
(1, 1, 490, 4096)
(490, 4096)
(4096, 4096)

Minimizing the Total Loss

In [19]:
from sys import stderr

# Loss weighting and optimizer settings.
ALPHA= 1e-2           # weight of the content term relative to the style term
learning_rate= 1e-3   # unused by L-BFGS-B; kept for experimentation
iterations = 1000     # optimizer iteration budget

result = None
with tf.Graph().as_default():

    # Build graph with variable input: the same random-conv "network" as
    # the feature-extraction cell, but with the input spectrogram as a
    # trainable variable initialized to small Gaussian noise.
    x = tf.Variable(np.random.randn(1,1,N_SAMPLES,N_CHANNELS).astype(np.float32)*1e-3, name="x")
    print(x.shape)
    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")

    relu_conv = tf.nn.elu(conv)
    net = tf.layers.max_pooling2d(inputs=relu_conv, pool_size=[1, 1], strides=1)
    print(net.get_shape)

    # Content loss: L2 distance to the precomputed content features.
    content_loss = ALPHA * 2 * tf.nn.l2_loss(
            net - content_features)

    # Style loss: L2 distance between Gram matrices of the current and
    # target (style) feature maps.
    _, height, width, number = map(lambda i: i.value, net.get_shape())
    print("height=",height)
    print("width=",width)
    print("number=",number)
    feats = tf.reshape(net, (-1, number))
    print(feats.shape)
    gram = tf.matmul(tf.transpose(feats), feats)  / N_SAMPLES
    style_loss = 2 * tf.nn.l2_loss(gram - style_gram)

    # Overall loss
    loss = content_loss + style_loss

    # L-BFGS-B via SciPy; maxiter now uses the `iterations` constant
    # instead of a duplicated magic number.
    opt = tf.contrib.opt.ScipyOptimizerInterface(
          loss, method='L-BFGS-B', options={'maxiter':iterations})
    # Optimization
    with tf.Session() as sess:
        # initialize_all_variables() is deprecated (the run log warned
        # about it); use the replacement the warning recommends.
        sess.run(tf.global_variables_initializer())

        print('Started optimization.')
        opt.minimize(sess)

        print ('Final loss:', loss.eval())
        result = x.eval()
(1, 1, 500, 1025)
<bound method Tensor.get_shape of <tf.Tensor 'max_pooling2d/MaxPool:0' shape=(1, 1, 490, 4096) dtype=float32>>
('height=', 1)
('width=', 490)
('number=', 4096)
(490, 4096)
WARNING:tensorflow:From /home/mahidharv/.local/lib/python2.7/site-packages/tensorflow/python/util/tf_should_use.py:118: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Started optimization.
INFO:tensorflow:Optimization terminated with:
  Message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
  Objective function value: 221.027771
  Number of iterations: 132
  Number of functions evaluations: 156
('Final loss:', 221.02777)

Inverse STFT (with iterative phase reconstruction) to get back Audio

In [20]:
# Invert the log1p scaling to recover a magnitude spectrogram from the
# optimized variable.
a = np.zeros_like(a_content)
a[:N_CHANNELS,:] = np.exp(result[0,0].T) - 1

# Phase reconstruction (Griffin-Lim style): start from random phase and
# alternate between time and frequency domains, keeping the known
# magnitudes and re-estimating phase on each round.
N_PHASE_ITERS = 500  # was a magic number; named for tunability
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for i in range(N_PHASE_ITERS):
    S = a * np.exp(1j*p)
    x = librosa.istft(S)
    p = np.angle(librosa.stft(x, N_FFT))

OUTPUT_FILENAME = './MidiBeethovenandMozart/Style4.wav'
librosa.output.write_wav(OUTPUT_FILENAME, x, fs)
In [31]:
from IPython.display import Audio,display
# Play back the reconstructed style-transfer result.
print( OUTPUT_FILENAME)
display(Audio(OUTPUT_FILENAME))
./MidiBeethovenandMozart/Style4.wav

Plotting the Results

In [21]:
# Final comparison: content, style, and the synthesized result.
plt.figure(figsize=(15,5))
plt.subplot(1,3,1)
plt.title('Content')
librosa.display.specshow(a_content, y_axis='log',x_axis='time')
plt.colorbar(format='%+2.0f dB')
#plt.imshow(a_content[:400,:])
plt.subplot(1,3,2)
plt.title('Style')
librosa.display.specshow(a_style, y_axis='log',x_axis='time')
plt.colorbar(format='%+2.0f dB')
#plt.imshow(a_style[:400,:])
plt.subplot(1,3,3)
plt.title('Result')
librosa.display.specshow(a, y_axis='log',x_axis='time')
plt.colorbar(format='%+2.0f dB')
#plt.imshow(a[:400,:])
plt.show()
© 2019 Copyright Built and deployed by Chaithanya, Mahidhar and Seema